if (rstudioapi::isAvailable()) {
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
}library(tidyverse)## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(conflicted)
conflict_prefer("filter", "dplyr")## [conflicted] Will prefer dplyr::filter over any other package
conflict_prefer("select", "dplyr")## [conflicted] Will prefer dplyr::select over any other package
conflict_prefer("slice", "dplyr")## [conflicted] Will prefer dplyr::slice over any other package
conflict_prefer("rename", "dplyr")## [conflicted] Will prefer dplyr::rename over any other package
conflict_prefer('intersect', 'dplyr')## [conflicted] Will prefer dplyr::intersect over any other package
library(gplots)
library(ggpubr)
library("quantreg") ## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
library(ggrepel)mytheme = theme_bw(base_size = 10) +
theme(text = element_text(size=10, colour='black'),
title = element_text(size=10, colour='black'),
line = element_line(size=0.5),
axis.title = element_text(size=10, colour='black'),
axis.text = element_text(size=10, colour='black'),
axis.line = element_line(colour = "black"),
axis.ticks = element_line(size=0.5),
strip.background = element_blank(),
strip.text.y = element_blank(),
panel.grid = element_blank(),
panel.border = element_blank(),
legend.position = c(0.8, 0.8),
legend.text=element_text(size=10))
mytheme_discrete_x = mytheme +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))
# a colorblind-friendly palette
# colorblind.palette.grey = c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")cq = read_tsv('../data/Supplementary_Table_3_selected_circRNAs.txt')## Rows: 1560 Columns: 47
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (26): chr, strand, circ_id, circ_id_strand, tool, cell_line, FWD_primer,...
## dbl (21): start, end, FWD_pos, FWD_length, REV_pos, REV_length, FWD_TM, REV_...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
all_circ = read_tsv('../data/Supplementary_Table_2_all_circRNAs.txt')## Rows: 1137099 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (16): chr, strand, cell_line, tool, circ_id, circ_id_strand, count_group...
## dbl (7): start, end, BSJ_count, n_detected, n_db, estim_len_in, BSJ_count_m...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
val = read_tsv('../data/Supplementary_Table_6A_precision_values.txt')## Rows: 29 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): tool, count_group
## dbl (16): nr_qPCR_total, nr_qPCR_fail, nr_qPCR_val, nr_RR_total, nr_RR_fail,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
all_circ$tool = factor(all_circ$tool, levels = c("circseq_cup", "CIRI2", "CIRIquant", "CircSplice", "find_circ", "CirComPara2", "CIRCexplorer3", "circtools", "Sailfish-cir", "NCLscan", "NCLcomparator", "PFv2", "ecircscreen", "KNIFE", "circRNA_finder", "segemehl"))
cq$tool = factor(cq$tool, levels = c("circseq_cup", "CIRI2", "CIRIquant", "CircSplice", "find_circ", "CirComPara2", "CIRCexplorer3", "circtools", "Sailfish-cir", "NCLscan", "NCLcomparator", "PFv2", "ecircscreen", "KNIFE", "circRNA_finder", "segemehl"))
val$tool = factor(val$tool, levels = c("circseq_cup", "CIRI2", "CIRIquant", "CircSplice", "find_circ", "CirComPara2", "CIRCexplorer3", "circtools", "Sailfish-cir", "NCLscan", "NCLcomparator", "PFv2", "ecircscreen", "KNIFE", "circRNA_finder", "segemehl"))
cqall_circsee panel 2 scripts
all_circ %>%
mutate(tool_group = ifelse(tool == "Sailfish-cir", 'sfc', 'other')) %>%
ggplot(aes(tool, BSJ_count)) +
geom_boxplot() +
scale_y_log10(labels = scales::comma_format()) +
ylab("BSJ count") +
xlab('') +
facet_wrap(~tool_group, scales = 'free') +
mytheme_discrete_x#ggsave('sup_figures/sup_figure_2.pdf', width = 20, height = 12, units = "cm")count_conc = tibble()
for (tool_1 in val %>% pull(tool) %>% unique()){
for (tool_2 in val %>% pull(tool) %>% unique()){
count_conc = count_conc %>%
bind_rows(all_circ %>% filter(tool == tool_1) %>%
select(cell_line, circ_id, BSJ_count, tool) %>% unique() %>%
rename(BSJ_count_1 = BSJ_count, tool_1 = tool) %>%
inner_join(all_circ %>% filter(tool == tool_2) %>%
select(cell_line, circ_id, BSJ_count, tool) %>% unique() %>%
rename(BSJ_count_2 = BSJ_count, tool_2 = tool)))
}
}## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
## Joining, by = c("cell_line", "circ_id")
count_conc# only keep unique combo's (every combo is calculated twice)
count_conc = count_conc %>%
mutate(tool_comb = paste(tool_1, tool_2, sep = "/"),
tool_comb = map_chr(str_split(tool_comb, "/"), ~str_c(str_sort(unique(.x)), collapse = "/"))) %>%
group_by(tool_comb, circ_id, cell_line) %>%
arrange(tool_1) %>%
filter(row_number()==1) %>%
ungroup()
count_conc %>%
filter(!tool_1 == tool_2) %>%
filter(!tool_1 == "Sailfish-cir", !tool_2 == 'Sailfish-cir') %>%
#filter(tool_1 == "Sailfish-cir" | tool_2 == 'Sailfish-cir') %>%
ggplot(aes(BSJ_count_1, BSJ_count_2)) +
geom_smooth(method = "lm", color = '#00B9F2') +
stat_regline_equation(label.x = -Inf, label.y = Inf, vjust = 1.5, hjust = -0.1, size = 3) +
# stat_cor(aes(label = paste(..rr.label.., ..p.label.., sep = "*`,`~")),
# label.y= Inf, label.x = -Inf, vjust = 2, hjust = -0.1, size = 3) +
stat_cor(aes(label = ..rr.label..),
label.y= Inf, label.x = -Inf, vjust = 3, hjust = -0.1, size = 3) +
stat_cor(aes(label = ..p.label..),
label.y= Inf, label.x = -Inf, vjust = 5, hjust = -0.1, size = 3) +
mytheme_discrete_x +
theme(aspect.ratio = 1) +
#coord_fixed(ratio = 1) +
xlim(0,4000) + ylim(0,4000) +
#xlim(0,130000) + ylim(0,130000) +
facet_wrap(~tool_comb, nrow = 7) +
xlab('') +
ylab('')## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing missing values (geom_smooth).
get same data in table
count_conc_lm = count_conc %>%
filter(!tool_1 == tool_2) %>%
filter(!tool_1 == "Sailfish-cir", !tool_2 == 'Sailfish-cir') %>%
group_by(tool_comb) %>%
do({
mod = lm(log10(BSJ_count_2) ~ log10(BSJ_count_1), data = .)
#mod = lm(BSJ_count_2 ~ BSJ_count_1, data = .)
data.frame(intercept = coef(mod)[1],
slope = coef(mod)[2],
R_squared = summary(mod)$r.squared)
})
count_conc_lmcount_conc_lm %>%
ggplot(aes(R_squared, slope)) +
geom_point() +
mytheme +
geom_hline(yintercept = 1, color = 'grey') +
geom_vline(xintercept = 1, color = 'grey') +
coord_fixed(ratio = 1)#ggsave('sup_figures/sup_figure_3.pdf', width = 10, height = 15, units = "cm")
# get median value
count_conc_lm %>% pull(slope) %>% quantile()## 0% 25% 50% 75% 100%
## 0.2020794 0.6093260 0.6966336 0.8103344 1.3136941
count_conc_lm %>% pull(R_squared) %>% quantile()## 0% 25% 50% 75% 100%
## 0.1919673 0.6218548 0.7060380 0.7555491 1.0000000
see panel 2 script
Heatmap based on counts here only for count ≥ 5 (in sup figure for all circ) calculate Jaccard index
jac = read_tsv("../data/Supplementary_Table_7_combo_2tools.txt") %>%
filter(cell_line == 'HLF') %>%
mutate(jac_index = nr_intersection/nr_union,
jac_dist = 1 - jac_index) # calculate jac index and distance## Rows: 675 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): tool_1, tool_2, cell_line, count_group, tool_lt_1, tool_lt_2, tool_...
## dbl (7): nr_union, nr_intersection, perc_compound_val_1, perc_compound_val_2...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
jaccount_table = jac %>%
select(tool_1, tool_2, jac_dist) %>%
pivot_wider(values_from = jac_dist, names_from = tool_2)
count_tablern = count_table$tool_1
count_table = count_table %>%
select(-tool_1)
count_matrix = as.matrix(count_table)
rownames(count_matrix) = rn
# pdf("../supplemental/sup_figures/sup_figure_5.pdf", height=8, width=12)
heatmap.2(count_matrix,
main = "HLF - bin - Jaccard",
trace = "none", density.info = "none")# dev.off()all_circ_db = all_circ %>%
mutate(n_db_group = ifelse(n_db == 1, '1 database', '0 databases'),
n_db_group = ifelse(n_db > 1, '2-5 databases', n_db_group),
n_db_group = ifelse(n_db > 5, '6-9 databases', n_db_group),
n_db_group = ifelse(n_db > 9, '≥ 10 databases', n_db_group),
n_db_group = ifelse(n_db == 16, 'all databases', n_db_group),
n_db_group = ifelse(is.na(n_db), '0 databases', n_db_group))
all_circ_db$n_db_group = factor(all_circ_db$n_db_group, levels = c('all databases', '≥ 10 databases', '6-9 databases', "2-5 databases", '1 database', "0 databases"))
all_circ_db$tool = factor(all_circ_db$tool, levels = c("circseq_cup", "CIRI2", "CIRIquant", "CircSplice", "find_circ", "CirComPara2", "CIRCexplorer3", "circtools", "Sailfish-cir", "NCLscan", "NCLcomparator", "PFv2", "ecircscreen", "KNIFE", "circRNA_finder", "segemehl"))
all_circ_db %>%
ggplot(aes(tool, fill = n_db_group)) +
geom_bar() +
scale_y_continuous(labels = scales::comma_format()) +
facet_wrap(~cell_line) +
ylab('number of circRNAs') +
xlab('') +
scale_fill_manual(name = "circRNAs reported by",
values = c("#0072B2", '#00B9F2', '#00A875', "#E69F00", "#CC79A7")) +
mytheme_discrete_x#ggsave('sup_figures/sup_figure_6.pdf', width = 20, height = 9, units = "cm")numbers for paper
all_circ_db %>%
select(circ_id, n_db) %>% unique() %>%
filter(is.na(n_db))round(215885/315312, digits = 3)## [1] 0.685
all_circ_db %>%
group_by(tool) %>%
summarise(perc_db = sum(is.na(n_db)) / n()) %>% arrange(perc_db)all_circ_db %>%
group_by(tool) %>%
summarise(perc_db = sum(is.na(n_db)) / n()) %>% arrange(perc_db) %>%
pull(perc_db) %>% quantile()## 0% 25% 50% 75% 100%
## 0.01620845 0.04856182 0.19688778 0.34802878 0.87837177
all_circ_enst = all_circ %>%
mutate(ENST_group = ifelse(is.na(ENST), NA, 'one match'),
ENST_group = ifelse(ENST == 'ambiguous', 'ambiguous', ENST_group))
all_circ_enst$ENST_group = factor(all_circ_enst$ENST_group, levels = c('one match', 'ambiguous', NA))
all_circ_enst %>%
ggplot(aes(tool, fill = ENST_group)) +
geom_bar(position = 'fill') +
mytheme_discrete_x +
theme(legend.position = "right") +
ylab('% of circRNAs') +
xlab('') +
scale_y_continuous(labels = scales::percent_format()) +
scale_fill_manual(name = "",
values = c('#F0E442', '#CC79A7', '#99999'))#ggsave('sup_figures/sup_figure_7.pdf', width = 18, height = 13, units = "cm")calculations
all_circ_enst %>%
select(chr, start, end, strand, ENST_group) %>% unique() %>%
count(ENST_group)round(100 * 216483 / (216483 + 41683 + 144782), 1)## [1] 53.7
round(100 * 41683 / (216483 + 41683 + 144782), 1)## [1] 10.3
round(100 * 144782 / (216483 + 41683 + 144782), 1)## [1] 35.9
all_circ %>%
mutate(estim_len_ex = end - start,
len_type = 'including all exons and introns') %>%
bind_rows(all_circ %>%
filter(!estim_len_ex == 'ambiguous',
!is.na(estim_len_ex)) %>%
mutate(estim_len_ex = as.numeric(estim_len_ex),
len_type = 'including all exons')) %>%
ggplot(aes(tool, estim_len_ex, color = len_type)) +
geom_boxplot(outlier.shape=NA, lwd = 1/.pt) +
mytheme_discrete_x +
scale_color_manual(values = c( '#CC79A7', '#E69F00')) +
theme(legend.title = element_blank(), legend.position = 'right') +
scale_y_log10(labels = scales::comma_format(), expand = expansion(mult = 0.5)) +
coord_cartesian(ylim = c(50, 110000)) +
ylab("estimated length of circRNAs") +
theme(axis.title.x=element_blank())#ggsave('sup_figures/sup_figure_8.pdf', width = 20, height = 10, units = "cm")all_circ_exons = all_circ %>%
filter(!nr_exons == 'ambiguous') %>%
mutate(nr_exons = as.numeric(nr_exons),
exon_group = ifelse(nr_exons == 1, 'single exon', NA),
exon_group = ifelse(nr_exons > 1, '2-5 exons', exon_group),
exon_group = ifelse(nr_exons > 5, '6-9 exons', exon_group),
exon_group = ifelse(nr_exons > 9, '≥ 10 exons', exon_group),
nr_exons = as.character(nr_exons)) %>%
bind_rows(all_circ %>%
filter(nr_exons == 'ambiguous') %>%
mutate(exon_group = ifelse(nr_exons == "ambiguous", 'ambiguous', exon_group))) %>%
bind_rows(all_circ %>%
filter(is.na(nr_exons)) %>%
mutate(exon_group = NA))
all_circ_exons %>% filter(exon_group == "ambiguous")all_circ_exons$exon_group = factor(all_circ_exons$exon_group,
levels = c('≥ 10 exons', '6-9 exons', '2-5 exons', 'single exon', 'ambiguous', NA))
all_circ_exons$tool = factor(all_circ_exons$tool, levels = c("circseq_cup", "CIRI2", "CIRIquant", "CircSplice", "find_circ", "CirComPara2", "CIRCexplorer3", "circtools", "Sailfish-cir", "NCLscan", "NCLcomparator", "PFv2", "ecircscreen", "KNIFE", "circRNA_finder", "segemehl"))
all_circ_exons %>%
#filter(!exon_group == 'ambiguous', !is.na(exon_group)) %>%
ggplot(aes(tool, fill = exon_group)) +
geom_bar(position = 'fill') +
mytheme_discrete_x +
scale_y_continuous(labels = scales::percent_format()) +
scale_fill_manual(name = "circRNAs with ...",
values = c("#0072B2", '#00B9F2', '#00A875', "#E69F00", '#CC79A7', '#99999')) +
theme(legend.position = NULL) +
xlab('') +
ylab('')#ggsave('sup_figures/sup_figure_9.pdf', width = 18, height = 13, units = "cm")first calculate strand distribution linear genes
ens = read_tsv('~/Documents/PhD/indexes/Homo_sapiens.GRCh38.103.genes.gtf') %>%
select(gene_id, strand) %>%
rename(circ_id = gene_id) %>%
mutate(tool = 'lineair genes',
tool_group = 'lin')## Rows: 60666 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (8): chr, database, type, dot, strand, dot2, gene_id, gene_name
## dbl (3): start, end, gene_version
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ensall_circ_strand = all_circ %>%
mutate(tool_group = 'circ') %>%
bind_rows(ens)
all_circ_strand$tool = factor(all_circ_strand$tool, levels = c("circseq_cup", "CIRI2", "CIRIquant", "CircSplice", "find_circ", "CirComPara2", "CIRCexplorer3", "circtools", "Sailfish-cir", "NCLscan", "NCLcomparator", "PFv2", "ecircscreen", "KNIFE", "circRNA_finder", "segemehl", 'lineair genes'))
all_circ_strand %>%
ggplot(aes(tool, fill = strand)) +
geom_bar(position = 'fill') +
ylab('% of circRNAs') +
scale_y_continuous(labels = scales::percent_format()) +
xlab('') +
facet_grid(~tool_group, scales = 'free_x', space = 'free') +
scale_fill_manual(values = c('#009E73', '#CC79A7' , '#5AB4E5')) +
mytheme_discrete_x +
theme(legend.position = 'right')#ggsave('sup_figures/sup_figure_10.pdf', width = 20, height = 12, units = "cm")